Clustering

Some customers didn't write a review. But why are they happy or mad?

In [1]:
import numpy as np
import pandas as pd 
import os
import holoviews as hv
import geoviews as gv
import datashader as ds
from colorcet import fire, rainbow, bgy, bjy, bkr, kb, kr
from datashader.colors import colormap_select, Greys9
from holoviews.streams import RangeXY
from holoviews.operation.datashader import datashade, dynspread, rasterize
from bokeh.io import push_notebook, show, output_notebook
import warnings

warnings.filterwarnings('ignore')

geo = pd.read_csv("../dataset/olist_geolocation_dataset.csv", dtype={'geolocation_zip_code_prefix': str})

# Gets the first three and four first digits of zip codes, we will explore this further to understand how zip codes works
geo['geolocation_zip_code_prefix_1_digits'] = geo['geolocation_zip_code_prefix'].str[0:1]
geo['geolocation_zip_code_prefix_2_digits'] = geo['geolocation_zip_code_prefix'].str[0:2]
geo['geolocation_zip_code_prefix_3_digits'] = geo['geolocation_zip_code_prefix'].str[0:3]
geo['geolocation_zip_code_prefix_4_digits'] = geo['geolocation_zip_code_prefix'].str[0:4]

# Removing some outliers
#Brazils most Northern spot is at 5 deg 16′ 27.8″ N latitude.;
geo = geo[geo.geolocation_lat <= 5.27438888]
#it’s most Western spot is at 73 deg, 58′ 58.19″W Long.
geo = geo[geo.geolocation_lng >= -73.98283055]
#It’s most southern spot is at 33 deg, 45′ 04.21″ S Latitude.
geo = geo[geo.geolocation_lat >= -33.75116944]
#It’s most Eastern spot is 34 deg, 47′ 35.33″ W Long.
geo = geo[geo.geolocation_lng <=  -34.79314722]



from datashader.utils import lnglat_to_meters as webm
x, y = webm(geo.geolocation_lng, geo.geolocation_lat)
geo['x'] = pd.Series(x)
geo['y'] = pd.Series(y)

# transforming the prefixes to int for plotting purposes
geo['geolocation_zip_code_prefix'] = geo['geolocation_zip_code_prefix'].astype(int)
geo['geolocation_zip_code_prefix_1_digits'] = geo['geolocation_zip_code_prefix_1_digits'].astype(int)
geo['geolocation_zip_code_prefix_2_digits'] = geo['geolocation_zip_code_prefix_2_digits'].astype(int)
geo['geolocation_zip_code_prefix_3_digits'] = geo['geolocation_zip_code_prefix_3_digits'].astype(int)
geo['geolocation_zip_code_prefix_4_digits'] = geo['geolocation_zip_code_prefix_4_digits'].astype(int)

brazil = geo
agg_name = 'geolocation_zip_code_prefix'

# plot wtih holoviews + datashader - bokeh with map background
output_notebook()
hv.extension('bokeh')

%opts Overlay [width=800 height=600 toolbar='above' xaxis=None yaxis=None]
%opts QuadMesh [tools=['hover'] colorbar=True] (alpha=0 hover_alpha=0.2)

T = 0.05
PX = 1

def plot_map(data, label, agg_data, agg_name, cmap):
    url="http://server.arcgisonline.com/ArcGIS/rest/services/Canvas/World_Dark_Gray_Base/MapServer/tile/{Z}/{Y}/{X}.png"
    geomap = gv.WMTS(url)
    points = hv.Points(gv.Dataset(data, kdims=['x', 'y'], vdims=[agg_name]))
    agg = datashade(points, element_type=gv.Image, aggregator=agg_data, cmap=cmap)
    zip_codes = dynspread(agg, threshold=T, max_px=PX)
    hover = hv.util.Dynamic(rasterize(points, aggregator=agg_data, width=50, height=25, streams=[RangeXY]), operation=hv.QuadMesh)
    hover = hover.options(cmap=cmap)
    img = geomap * zip_codes * hover
    img = img.relabel(label)
    return img

orders_df = pd.read_csv('../dataset/olist_orders_dataset.csv')
order_items = pd.read_csv('../dataset/olist_order_items_dataset.csv')
order_reviews = pd.read_csv('../dataset/olist_order_reviews_dataset.csv')
customer = pd.read_csv('../dataset/olist_customers_dataset.csv', dtype={'customer_zip_code_prefix': str})

# getting the first 3 digits of customer zipcode
customer['customer_zip_code_prefix_3_digits'] = customer['customer_zip_code_prefix'].str[0:3]
customer['customer_zip_code_prefix_3_digits'] = customer['customer_zip_code_prefix_3_digits'].astype(int)

brazil_geo = geo.set_index('geolocation_zip_code_prefix_3_digits').copy()

# merging the data
orders = orders_df.merge(order_items, on='order_id')
orders = orders.merge(customer, on='customer_id')
orders = orders.merge(order_reviews, on='order_id')

orders['order_delivered_customer_date'] = pd.to_datetime(orders.order_delivered_customer_date)
orders['order_estimated_delivery_date'] = pd.to_datetime(orders.order_estimated_delivery_date)
orders['order_delivered_carrier_date'] = pd.to_datetime(orders.order_delivered_carrier_date)
orders['actual_delivery_time'] = orders.order_delivered_customer_date - orders.order_delivered_carrier_date
orders['actual_delivery_time'] = orders['actual_delivery_time'].dt.days

gp = orders.groupby('customer_zip_code_prefix_3_digits')['review_score'].mean().to_frame()
score = brazil_geo.join(gp)
agg_name = 'avg_score'
score[agg_name] = score['review_score']

plot_map(score, 'Orders Average Review Score', ds.mean(agg_name), agg_name, cmap=bgy)
Loading BokehJS ...
Out[1]:
In [2]:
orders['is_delayed'] = orders['order_delivered_customer_date'] > orders['order_estimated_delivery_date'] 
gp = orders.groupby('customer_zip_code_prefix_3_digits').agg({'is_delayed': ['sum', 'count']})
agg_name = 'delayed'
gp[agg_name] = gp['is_delayed']['sum'] / gp['is_delayed']['count']
gp = gp[agg_name]
order_delay = brazil_geo.join(gp)

plot_map(order_delay, 'Orders Delay Percentage in Brazil', ds.mean(agg_name), agg_name, cmap=bgy)
Out[2]:
In [3]:
gp = orders.groupby('customer_zip_code_prefix_3_digits')['review_score'].mean().to_frame()
gp = gp.loc[(gp['review_score'] < 3.5)]
score = brazil_geo.join(gp)
agg_name = 'avg_score'
score[agg_name] = score['review_score']

plot_map(score, 'Orders Bad Review Score', ds.mean(agg_name), agg_name, cmap=bgy)
Out[3]:
In [4]:
orders['is_delayed'] = orders['order_delivered_customer_date'] > orders['order_estimated_delivery_date'] 
gp = orders.groupby('customer_zip_code_prefix_3_digits').agg({'is_delayed': ['sum', 'count']})
agg_name = 'delayed'
gp[agg_name] = gp['is_delayed']['sum'] / gp['is_delayed']['count']
gp = gp.loc[(gp['delayed'] > 0.2)]
gp = gp[agg_name]
order_delay = brazil_geo.join(gp)

plot_map(order_delay, 'Orders With Big Delay Percentage in Brazil', ds.mean(agg_name), agg_name, cmap=bgy)
Out[4]:
In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns;
import warnings

warnings.filterwarnings('ignore')

orders = pd.read_csv('../dataset/olist_orders_dataset.csv', delimiter = ',')
customers = pd.read_csv('../dataset/olist_customers_dataset.csv', delimiter = ',')
reviews = pd.read_csv('../dataset/olist_order_reviews_dataset.csv', delimiter = ',')

final = pd.merge(orders, customers, on='customer_id')
final = pd.merge(final, reviews, on='order_id')

final['is_delayed'] = final['order_delivered_customer_date'] > final['order_estimated_delivery_date']

final['order_delivered_customer_date'] = pd.to_datetime(final['order_delivered_customer_date'])
final['order_estimated_delivery_date'] = pd.to_datetime(final['order_estimated_delivery_date'])
final['time_delay'] = final['order_delivered_customer_date'] - final['order_estimated_delivery_date']

customer_delay = final.groupby('order_id', as_index=False).agg({'is_delayed': ['sum', 'count']})
customer_delay['delay'] = customer_delay['is_delayed']['sum'] / customer_delay['is_delayed']['count']

state_delay = final.groupby('customer_state', as_index=False).agg({'is_delayed': ['sum', 'count']})
state_delay['delay'] = state_delay['is_delayed']['sum'] / state_delay['is_delayed']['count']

customer_delay = customer_delay.sort_values(by='delay', ascending=False)
state_delay = state_delay.sort_values(by='delay', ascending=False)

state_reviews = final.groupby('customer_state', as_index=False)['review_score'].mean()
state_reviews = pd.merge(state_reviews, state_delay, on='customer_state')
state_reviews['delay'] = state_reviews[('delay', '')]
state_reviews['delayed'] = state_reviews[('is_delayed', 'sum')]
state_reviews['total_orders'] = state_reviews[('is_delayed', 'count')]
state_reviews = state_reviews[['customer_state','review_score','delay','delayed','total_orders']]

state_reviews = state_reviews.sort_values('delay', ascending=False)

state_reviews.head(27)
Out[5]:
customer_state review_score delay delayed total_orders
1 AL 3.731415 0.230216 96.0 417
9 MA 3.745672 0.187750 141.0 751
16 PI 3.907258 0.153226 76.0 496
5 CE 3.837939 0.147125 197.0 1339
24 SE 3.800000 0.145714 51.0 350
4 BA 3.834314 0.135668 461.0 3398
18 RJ 3.853442 0.129466 1674.0 12930
26 TO 4.100000 0.125000 35.0 280
7 ES 4.009790 0.119922 245.0 2043
13 PA 3.827902 0.119145 117.0 982
11 MS 4.115702 0.112948 82.0 726
21 RR 3.608696 0.108696 5.0 46
14 PB 4.007449 0.106145 57.0 537
19 RN 4.075975 0.104723 51.0 487
15 PE 3.993393 0.103303 172.0 1665
23 SC 4.055875 0.095316 348.0 3651
8 GO 4.026019 0.079529 162.0 2037
22 RS 4.125658 0.070093 386.0 5507
6 DF 4.056019 0.068981 149.0 2160
12 MT 4.087912 0.065934 60.0 910
25 SP 4.160579 0.057021 2393.0 41967
10 MG 4.121049 0.054758 641.0 11706
17 PR 4.167654 0.048973 248.0 5064
3 AP 4.176471 0.044118 3.0 68
2 AM 4.154362 0.040268 6.0 149
0 AC 4.049383 0.037037 3.0 81
20 RO 4.055336 0.027668 7.0 253

States with biggest Average Delay (%)

In [6]:
#dado.reset_index(inplace=True)
sns.set(rc={'figure.figsize':(15,13)})
ax = sns.barplot(x="delay", y="customer_state", data=(state_reviews.head(27)))

States with biggest Average Reviews

In [7]:
state_reviews = state_reviews.sort_values('review_score', ascending=True)
ax = sns.barplot(x="review_score", y="customer_state", data=state_reviews.head(27))
ax.set(xlim=(3.5, 4.2))
Out[7]:
[(3.5, 4.2)]
In [8]:
delay = final.groupby(["is_delayed"]).size().reset_index(name="amount")
delay = delay.iloc[1][1] / delay.iloc[0][1]
print(("%.2f" % (delay*100)),'% of all orders are delayed\n')

delay = final.loc[(final['review_score'] <= 3)]
delay = delay.groupby(["is_delayed"]).size().reset_index(name="amount")
bad_delay = delay.iloc[1][1] / delay.iloc[0][1]
print(("%.2f" % (bad_delay*100)),'% of the orders with bad reviews are delayed\n')

delay = final.loc[(final['review_score'] >= 4)]
delay = delay.groupby(["is_delayed"]).size().reset_index(name="amount")
good_delay = delay.iloc[1][1] / delay.iloc[0][1]
print(("%.2f" % (good_delay*100)),'% of the orders with good reviews are delayed \n')

print(("%.2f" % ((1-bad_delay)*100)),'% of the orders with bad review are not related to delay\n')
print(("%.2f" % ((1-good_delay)*100)),'% of the orders with good review are not delayed \n')
8.54 % of all orders are delayed

28.51 % of the orders with bad reviews are delayed

3.62 % of the orders with good reviews are delayed 

71.49 % of the orders with bad review are not related to delay

96.38 % of the orders with good review are not delayed 

Product categories with bad review (without delay)

In [9]:
orders = pd.read_csv('../dataset/olist_orders_dataset.csv', delimiter = ',')
order_item = pd.read_csv('../dataset/olist_order_items_dataset.csv', delimiter = ',')
products = pd.read_csv('../dataset/olist_products_dataset.csv', delimiter = ',')
reviews = pd.read_csv('../dataset/olist_order_reviews_dataset.csv', delimiter = ',')
customer = pd.read_csv('../dataset/olist_customers_dataset.csv', delimiter = ',')

orders = orders[['order_id','customer_id', 'order_purchase_timestamp', 'order_delivered_customer_date', 'order_estimated_delivery_date']]
order_item = order_item[['order_id','product_id']]
products = products[['product_id','product_category_name']]
reviews = reviews[['order_id','review_score', 'review_creation_date']]
customer = customer[['customer_id', 'customer_city','customer_state']]

final = pd.merge(orders, order_item, on='order_id')
final = pd.merge(final, reviews, on='order_id')
final = pd.merge(final, products, on='product_id')
final = pd.merge(final, customer, on='customer_id')
final['is_delayed'] = final['order_delivered_customer_date'] > final['order_estimated_delivery_date']

reviews = final[['review_score','product_category_name', 'is_delayed']]
insatisfaction = reviews.loc[(reviews['review_score'] <= 3) & (reviews['is_delayed'] == False)]
insatisfaction = insatisfaction.groupby(["product_category_name"]).size().reset_index(name="Bad_reviews")

review = reviews.groupby(["product_category_name"]).size().reset_index(name="All_reviews")

insatisfaction = pd.merge(insatisfaction, review, on='product_category_name')
insatisfaction['Average_Insatisfaction'] = insatisfaction['Bad_reviews'] / insatisfaction['All_reviews']
insatisfaction = insatisfaction.sort_values('Average_Insatisfaction', ascending=False)
insatisfaction.head(10)
Out[9]:
product_category_name Bad_reviews All_reviews Average_Insatisfaction
42 fraldas_higiene 21 39 0.538462
67 seguros_e_servicos 1 2 0.500000
65 portateis_cozinha_e_preparadores_de_alimentos 7 15 0.466667
55 moveis_escritorio 586 1701 0.344503
60 pc_gamer 3 9 0.333333
5 artigos_de_festas 13 43 0.302326
71 telefonia_fixa 80 265 0.301887
36 fashion_roupa_feminina 15 50 0.300000
38 fashion_roupa_masculina 39 132 0.295455
46 la_cuisine 4 14 0.285714

Product categories with bad review (with delay)

In [10]:
insatisfaction = reviews.loc[(reviews['review_score'] <= 3) & (reviews['is_delayed'] == True)]
insatisfaction = insatisfaction.groupby(["product_category_name"]).size().reset_index(name="Bad_reviews")

review = reviews.groupby(["product_category_name"]).size().reset_index(name="All_reviews")

insatisfaction = pd.merge(insatisfaction, review, on='product_category_name')
insatisfaction['Average_Insatisfaction'] = insatisfaction['Bad_reviews'] / insatisfaction['All_reviews']
insatisfaction = insatisfaction.sort_values('Average_Insatisfaction', ascending=False)
insatisfaction.head(10)
Out[10]:
product_category_name Bad_reviews All_reviews Average_Insatisfaction
15 casa_conforto_2 5 30 0.166667
48 moveis_colchao_e_estofado 4 38 0.105263
7 audio 37 365 0.101370
37 fashion_underwear_e_moda_praia 13 131 0.099237
6 artigos_de_natal 13 153 0.084967
4 artes_e_artesanato 2 24 0.083333
14 casa_conforto 31 437 0.070938
60 portateis_cozinha_e_preparadores_de_alimentos 1 15 0.066667
26 dvds_blu_ray 4 64 0.062500
9 bebes 191 3073 0.062154

Conclusion